Data set 1: 10X purified cells

X = readMM("../data/10X_pbmc_filtered/matrix.mtx")
X = as.matrix(X)
X = t(X)
label = read.table("../data/10X_pbmc_filtered/truelabel.tsv")$V1
rm(orig)
df = preprocess(X, label=label, normalize = FALSE)
df2 = preprocess(X, label = label, normalize = TRUE)

rm(X); gc()
##           used (Mb) gc trigger   (Mb)   max used    (Mb)
## Ncells 1749429 93.5    3683070  196.7    2783514   148.7
## Vcells 4756188 36.3 1096746803 8367.6 1784982126 13618.4
table(label)
## label
##               B            cd34      cytotoxicT         helperT 
##            1355            3094            1906            1035 
##         memoryT       monocytes naivecytotoxicT          naiveT 
##            1716              88            1292             456 
##              NK     regulatoryT 
##            1443             689
tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0") +
  ggtitle("dropout rates vs gene mean")

tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("normalized")

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("B", "helperT")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("two celltypes selected")

tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("B", "helperT")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("normalized")

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))

df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2

fitdf = df %>% group_by(celltype) %>%  do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)

fitted = list()

newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
  tmp = df %>% filter(celltype==names(table(label))[t])
  tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
  newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]

ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g1 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) + 
  geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
  ylab("dropout rate") +
  xlab("gene mean") +
  ylim(c(0,1)) + 
  ggtitle("fitted exponential decay")



df = df2
df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))

df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2

fitdf = df %>% group_by(celltype) %>%  do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)

fitted = list()

newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
  tmp = df %>% filter(celltype==names(table(label))[t])
  tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
  newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]

ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g2 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) + 
  geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
  ylab("dropout rate") +
  xlab("gene mean") +
  ggtitle("normalized") + 
  ylim(c(0,1))

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

Data set 2 : 10X 68K cells

X = readMM("../data/10X_68K/filtered_matrix/hg19/matrix.mtx")
barcodes = read.table("../data/10X_68K/filtered_matrix/hg19/barcodes.tsv", sep = "\t")
genes = read.table("../data/10X_68K/filtered_matrix/hg19/genes.tsv", sep = "\t")
annotation = read.table("../data/10X_68K/filtered_matrix/hg19/barocdes_annotation.tsv", sep = "\t", header = TRUE)
label = annotation$celltype

X = t(as.matrix(X))
df = preprocess(X, label=label, normalize = FALSE)
df2 = preprocess(X, label = label, normalize = TRUE)

rm(X); gc()
##           used  (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells 2064127 110.3    3683070   196.7    3683070   196.7
## Vcells 6484855  49.5 5825166023 44442.5 7584762096 57867.2
table(label)
## label
##               CD14+ Monocyte                      CD19+ B 
##                         2862                         5908 
##                        CD34+               CD4+ T Helper2 
##                          277                           97 
##              CD4+/CD25 T Reg   CD4+/CD45RA+/CD25- Naive T 
##                         6187                         1873 
##          CD4+/CD45RO+ Memory                     CD56+ NK 
##                         3061                         8776 
##             CD8+ Cytotoxic T CD8+/CD45RA+ Naive Cytotoxic 
##                        20773                        16666 
##                    Dendritic 
##                         2099
tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0") +
  ggtitle("dropout rates vs gene mean")

tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("normalized")

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("CD19+ B", "CD4+/CD45RO+ Memory")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("two celltypes selected")

tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("CD19+ B", "CD4+/CD45RO+ Memory")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("normalized")

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))

df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2

fitdf = df %>% group_by(celltype) %>%  do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)

fitted = list()

newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
  tmp = df %>% filter(celltype==names(table(label))[t])
  tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
  newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]

ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g1 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) + 
  geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
  ylab("dropout rate") +
  xlab("gene mean") +
  ylim(c(0,1)) + 
  ggtitle("fitted exponential decay")


df = df2
df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))

df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2

fitdf = df %>% group_by(celltype) %>%  do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)

fitted = list()

newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
  tmp = df %>% filter(celltype==names(table(label))[t])
  tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
  newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]

ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g2 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) + 
  geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
  ylab("dropout rate") +
  xlab("gene mean") +
  ggtitle("normalized") + 
  ylim(c(0,1))

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

Data Set 3 : GSE76381 Developmental Midbrain

x = fread("../data/GSE76381/GSE76381_ESMoleculeCounts.cef.txt.gz")
X = x[-(1:4), -2]
X = matrix(NA, nrow(X)-1, ncol(X) - 1)
label = as.character(x[3, -(1:2)])
X = (as.matrix(x[6:nrow(x), 3:ncol(x)]))
X = apply(X, 2, as.numeric)
rownames(X) = x$V1[-(1:5)]
colnames(X) = x[2, -(1:2)]
rm(x); gc()
##            used  (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells  2144155 114.6    3683070   196.7    3683070   196.7
## Vcells 65613692 500.6 4660132818 35554.0 7584762096 57867.2
X = t(X)
df = preprocess(X, label=label, normalize = FALSE)
df2 = preprocess(X, label = label, normalize = TRUE)

rm(X); gc()
##            used  (Mb) gc trigger   (Mb)   max used    (Mb)
## Ncells  2138958 114.3    3683070  196.7    3683070   196.7
## Vcells 35430539 270.4  977300684 7456.3 7584762096 57867.2
table(label)
## label
##    eNb1    eNb2    eNb3    eNb4 eProg1a eProg1b eProg2a eProg2b   eRgla 
##     121      84      42      45     244     263     126     118      67 
##   eRglb   eRglc   eRgld   eRgle   eRglf    eSCa    eSCb    eSCc 
##      68      55      47      45      69     126     117      78
tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0") +
  ggtitle("dropout rates vs gene mean")

tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("normalized")

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>%  filter(celltype %in% c("eNb1", "eSCa")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("two celltypes selected")

tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>%  filter(celltype %in% c("eNb1", "eSCa")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("normalized")

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))

df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2

fitdf = df %>% group_by(celltype) %>%  do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)

fitted = list()

newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
  tmp = df %>% filter(celltype==names(table(label))[t])
  tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
  newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]

ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g1 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) + 
  geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
  ylab("dropout rate") +
  xlab("gene mean") +
  ylim(c(0,1)) + 
  ggtitle("fitted exponential decay")


df = df2
df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))

df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2

fitdf = df %>% group_by(celltype) %>%  do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)

fitted = list()

newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
  tmp = df %>% filter(celltype==names(table(label))[t])
  tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
  newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]

ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g2 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) + 
  geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
  ylab("dropout rate") +
  xlab("gene mean") +
  ggtitle("normalized") + 
  ylim(c(0,1))

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

Data 4 : GSE114724 Diverse Immune Phenotypes in the Breast Tumor Microenvironment

subpart 1 : Patient BC09 Tumor 1

X = readRDS("../data/GSE114724/GSE114724_CLEANED/bc09_tumor1.rds")
label = read.table("../data/GSE114724/GSE114724_CLEANED/bc09_tumor1_label.txt", stringsAsFactors = FALSE)$V1
X = t(as.matrix(X))
df = preprocess(X, label=label, normalize = FALSE)
df2 = preprocess(X, label = label, normalize = TRUE)

rm(X); gc()
##            used  (Mb) gc trigger   (Mb)   max used    (Mb)
## Ncells  2140890 114.4    3683070  196.7    3683070   196.7
## Vcells 34927001 266.5  500377949 3817.6 7584762096 57867.2
table(label)
## label
##                 B:        MACROPHAGE:              MAST: 
##                664                310                 10 
##               mDC: MONOCYTE:precursor        NEUTROPHIL: 
##                 97                355                 52 
##      NK:CD56+16+3-   NK:CD56+16+3+NKT                NKT 
##                 31                953                180 
##           T:CD4+CM        T:CD4+NAIVE           T:CD8+EM 
##                249               1290                331 
##        T:CD8+NAIVE 
##                765
tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0") +
  ggtitle("dropout vs gene mean")

tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(gene_mean<10)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("normalized")

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

tmpdf = df %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("B:", "NEUTROPHIL:")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g1 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("not normalized")

tmpdf = df2 %>% filter(dropout_rate < 0.95) %>% filter(dropout_rate > 0.05) %>% filter(celltype %in% c("B:", "NEUTROPHIL:")) %>% filter(gene_mean<4)
tmpdf = tmpdf[sample(1:nrow(tmpdf)), ]
g2 = ggplot(tmpdf, 
       aes(x=gene_mean, y=dropout_rate, col = celltype)) +
  geom_point(alpha = 0.4) + 
  ylab("drop out rate") +
  xlab("mean(X) for X>0")+
  ggtitle("normalized")

ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))

df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2

fitdf = df %>% group_by(celltype) %>%  do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)

fitted = list()

newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
  tmp = df %>% filter(celltype==names(table(label))[t])
  tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
  newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]

ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g1 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) + 
  geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
  ylab("dropout rate") +
  xlab("gene mean") +
  ylim(c(0,1)) + 
  ggtitle("not normalized")



df = df2
df$range = cut(df$dropout_rate, seq(0.01,0.99,length=99), include.lowest=TRUE)
df = df%>% group_by(range, celltype) %>% summarize(var = var(gene_mean), mean = mean(gene_mean))

df$range = as.character(df$range)
df$minrange = substring(sapply(strsplit(df$range, ','), function(x) x[1]), 2)
df$maxrange = gsub('.{1}$', '', sapply(strsplit(df$range, ','), function(x) x[2]) )
df$minrange = as.numeric(df$minrange)
df$maxrange = as.numeric(df$maxrange)
df$mean_dropout_rate = (df$minrange + df$maxrange) / 2

fitdf = df %>% group_by(celltype) %>%  do(decayfit = nls(mean_dropout_rate ~ SSasymp(mean, yf, y0, log_alpha), data=.))
decayfit_tidy = broom::tidy(fitdf, decayfit)

fitted = list()

newdf = data.frame(range = NA, celltype = NA, var = NA, mean = NA, minrange = NA, maxrange = NA, mean_dropout_rate = NA, fitted = NA)
for (t in 1:length(table(label))){
  tmp = df %>% filter(celltype==names(table(label))[t])
  tmp$fitted = predict(fitdf$decayfit[t][[1]], newdata = tmp)
  newdf = rbind(newdf, as.data.frame(tmp))
}
newdf = newdf[-1, ]

ind = sort(newdf$mean_dropout_rate, index.return=TRUE)$ix
newdf = newdf[ind, ]
newdf = newdf %>% filter(mean < 7)
g2 = ggplot(newdf%>% filter(mean<7), aes(x=mean, y=mean_dropout_rate, col=celltype)) + 
  geom_point(alpha = 0.5) + geom_line(aes(x=mean, y=fitted, col=celltype), lwd=2, alpha = 0.3) +
  ylab("dropout rate") +
  xlab("gene mean") +
  ggtitle("normalized") + 
  ylim(c(0,1))


ggarrange(g1, g2, nrow=1, ncol=2, common.legend=TRUE)

subpart 2 : Patient BC09 Tumor 2

##            used  (Mb) gc trigger   (Mb)   max used    (Mb)
## Ncells  2139243 114.3    3683070  196.7    3683070   196.7
## Vcells 34874919 266.1  553577148 4223.5 7584762096 57867.2
## label
##                 B:        MACROPHAGE:              MAST: 
##                704                312                 14 
##               mDC: MONOCYTE:precursor        NEUTROPHIL: 
##                 66                340                 42 
##      NK:CD56+16+3-   NK:CD56+16+3+NKT                NKT 
##                 27               1108                167 
##           T:CD4+CM        T:CD4+NAIVE           T:CD8+EM 
##                186               1236                347 
##        T:CD8+NAIVE 
##                707

subpart 3 : Patient BC10 Tumor 1

##            used  (Mb) gc trigger   (Mb)   max used    (Mb)
## Ncells  2139894 114.3    3683070  196.7    3683070   196.7
## Vcells 34834739 265.8  354289374 2703.1 7584762096 57867.2
## label
##                 B:        MACROPHAGE:              MAST: 
##                242                203                 36 
##               mDC: MONOCYTE:precursor        NEUTROPHIL: 
##                327                288                182 
##      NK:CD56+16+3-   NK:CD56+16+3+NKT                NKT 
##                 27                111                120 
##           T:CD4+CM        T:CD4+NAIVE           T:CD8+EM 
##                210               1082                243 
##        T:CD8+NAIVE 
##                278

subpart 4 : Patient BC11 Tumor 1

##            used  (Mb) gc trigger   (Mb)   max used    (Mb)
## Ncells  2140861 114.4    3683070  196.7    3683070   196.7
## Vcells 34944102 266.7  470466484 3589.4 7584762096 57867.2
## label
##                 B:        MACROPHAGE:              MAST: 
##                450                376                 39 
##               mDC: MONOCYTE:precursor        NEUTROPHIL: 
##                116                 52                 70 
##      NK:CD56+16+3-   NK:CD56+16+3+NKT                NKT 
##                 45                178                395 
##           T:CD4+CM        T:CD4+NAIVE           T:CD8+EM 
##                112               1245                616 
##        T:CD8+NAIVE 
##                533

subpart 5 : Patient BC11 Tumor 2

##            used  (Mb) gc trigger   (Mb)   max used    (Mb)
## Ncells  2141564 114.4    3683070  196.7    3683070   196.7
## Vcells 34897125 266.3  376373187 2871.5 7584762096 57867.2
## label
##                 B:        MACROPHAGE:              MAST: 
##                389                359                 70 
##               mDC: MONOCYTE:precursor        NEUTROPHIL: 
##                 83                 32                 49 
##      NK:CD56+16+3-   NK:CD56+16+3+NKT                NKT 
##                 47                157                308 
##           T:CD4+CM        T:CD4+NAIVE           T:CD8+EM 
##                103               1107                529 
##        T:CD8+NAIVE 
##                463